NUID: 001877232
Class: CS5200
Academic Term: Summer-2, 2020
Instructor: Dr. Martin Schedlbauer
Email : Batra.am@northeastern.edu
Type : Individual Submission
GITHUB Repository: Click Here
Installation Screenshots:
Installing Mongo from Homebrew and setting default directory :
“mongod –dbpath /users/amanbatra/desktop/Data/db”
Part 1
Starting the Mongo DB Server
“mongo –host localhost 27017”
Part 2
Checking the MongoDB version “mongo –version”
Part 3
Checking the installation of MongoDB compass:
All databases have loaded correctly
Part 4
# Setting up the working directory
setwd("/users/amanbatra/desktop/Data/db")
#reading the data heads from db file
crimes=data.table::fread("Crimes_-_2001_to_Present.csv")
names(crimes)
## [1] "ID" "Case Number" "Date"
## [4] "Block" "IUCR" "Primary Type"
## [7] "Description" "Location Description" "Arrest"
## [10] "Domestic" "Beat" "District"
## [13] "Ward" "Community Area" "FBI Code"
## [16] "X Coordinate" "Y Coordinate" "Year"
## [19] "Updated On" "Latitude" "Longitude"
## [22] "Location"
#Remove space from column names, to make it easier for Mongo DB
names(crimes) = gsub(" ","",names(crimes))
names(crimes)
## [1] "ID" "CaseNumber" "Date"
## [4] "Block" "IUCR" "PrimaryType"
## [7] "Description" "LocationDescription" "Arrest"
## [10] "Domestic" "Beat" "District"
## [13] "Ward" "CommunityArea" "FBICode"
## [16] "XCoordinate" "YCoordinate" "Year"
## [19] "UpdatedOn" "Latitude" "Longitude"
## [22] "Location"
# Creating a new MongoDB collection object and loading the data into it
my_collection = mongo(collection = "crimes", db = "Chicago") # create connection, database and collection
my_collection$insert(crimes)
## List of 5
## $ nInserted : num 7168941
## $ nMatched : num 0
## $ nRemoved : num 0
## $ nUpserted : num 0
## $ writeErrors: list()
# Checking the data count
my_collection$count()
## [1] 7168941
# Fetching the first record from the R database object
my_collection$iterate()$one()
## $ID
## [1] 11034701
##
## $CaseNumber
## [1] "JA366925"
##
## $Date
## [1] "01/01/2001 11:00:00 AM"
##
## $Block
## [1] "016XX E 86TH PL"
##
## $IUCR
## [1] "1153"
##
## $PrimaryType
## [1] "DECEPTIVE PRACTICE"
##
## $Description
## [1] "FINANCIAL IDENTITY THEFT OVER $ 300"
##
## $LocationDescription
## [1] "RESIDENCE"
##
## $Arrest
## [1] FALSE
##
## $Domestic
## [1] FALSE
##
## $Beat
## [1] 412
##
## $District
## [1] 4
##
## $Ward
## [1] 8
##
## $CommunityArea
## [1] 45
##
## $FBICode
## [1] "11"
##
## $Year
## [1] 2001
##
## $UpdatedOn
## [1] "08/05/2017 03:50:08 PM"
##
## $Location
## [1] ""
# How many distinct “Primary Type” aka types of crimes do we have?
length(my_collection$distinct("PrimaryType"))
## [1] 36
# how many domestic assualts there are in the collection
my_collection$count('{"PrimaryType" : "ASSAULT", "Domestic" : true }')
## [1] 100443
# To get the filtered data and we can also retrieve only the columns of interest
query1= my_collection$find('{"PrimaryType" : "ASSAULT", "Domestic" : true }')
query2= my_collection$find('{"PrimaryType" : "ASSAULT", "Domestic" : true }',
fields = '{"_id":0, "PrimaryType":1, "Domestic":1}')
ncol(query1) # with all the columns
## [1] 22
ncol(query2) # only the selected columns
## [1] 2
Where do most crimes take place?
my_collection$aggregate('[{"$group":{"_id":"$LocationDescription", "Count": {"$sum":1}}}]')%>%na.omit()%>%
arrange(desc(Count))%>%head(10)%>%
ggplot(aes(x=reorder(`_id`,Count),y=Count))+
geom_bar(stat="identity",color='skyblue',fill='#b35900')+geom_text(aes(label = Count), color = "blue") +coord_flip()+xlab("Location Description")
If loading the entire dataset we are working with does not slow down our analysis, we can use data.table or dplyr but when dealing with big data, using MongoDB can give us performance boost as the whole data will not be loaded into mememory. We can reproduce the above plot without using MongoDB, like so:
crimes%>%group_by(`LocationDescription`)%>%summarise(Total=n())%>% arrange(desc(Total))%>%head(10)%>%
ggplot(aes(x=reorder(`LocationDescription`,Total),y=Total))+
geom_bar(stat="identity",color='skyblue',fill='#b35900')+geom_text(aes(label = Total), color = "blue") +coord_flip()+xlab("Location Description")
## `summarise()` ungrouping output (override with `.groups` argument)
Query to load only the columns we want and to save memory for our analysis.
query3= my_collection$find('{}', fields = '{"_id":0, "Latitude":1, "Longitude":1,"Year":1}')
options(max.print=500)
print.data.frame(query3)
## Year Latitude Longitude
## 1 2001 NA NA
## 2 2017 NA NA
## 3 2017 NA NA
## 4 2017 NA NA
## 5 2017 NA NA
## 6 2013 NA NA
## 7 2015 NA NA
## 8 2017 NA NA
## 9 2017 NA NA
## 10 2017 NA NA
## 11 2017 NA NA
## 12 2017 NA NA
## 13 2012 NA NA
## 14 2017 NA NA
## 15 2017 NA NA
## 16 2017 NA NA
## 17 2014 NA NA
## 18 2015 NA NA
## 19 2015 NA NA
## 20 2017 NA NA
## 21 2011 NA NA
## 22 2013 NA NA
## 23 2007 NA NA
## 24 2011 NA NA
## 25 2013 NA NA
## 26 2017 NA NA
## 27 2014 NA NA
## 28 2015 NA NA
## 29 2013 NA NA
## 30 2010 NA NA
## 31 2015 NA NA
## 32 2015 NA NA
## 33 2007 NA NA
## 34 2015 NA NA
## 35 2015 NA NA
## 36 2008 NA NA
## 37 2017 NA NA
## 38 2001 NA NA
## 39 2015 NA NA
## 40 2016 NA NA
## 41 2003 NA NA
## 42 2017 NA NA
## 43 2007 NA NA
## 44 2017 NA NA
## 45 2018 NA NA
## 46 2001 NA NA
## 47 2014 NA NA
## 48 2001 NA NA
## 49 2017 NA NA
## 50 2016 NA NA
## 51 2017 NA NA
## 52 2017 NA NA
## 53 2016 NA NA
## 54 2017 NA NA
## 55 2018 NA NA
## 56 2017 NA NA
## 57 2017 NA NA
## 58 2017 NA NA
## 59 2014 NA NA
## 60 2017 NA NA
## 61 2017 NA NA
## 62 2001 NA NA
## 63 2017 NA NA
## 64 2018 NA NA
## 65 2016 NA NA
## 66 2018 NA NA
## 67 2018 NA NA
## 68 2018 NA NA
## 69 2018 NA NA
## 70 2014 NA NA
## 71 2017 NA NA
## 72 2018 NA NA
## 73 2017 NA NA
## 74 2016 NA NA
## 75 2017 NA NA
## 76 2017 NA NA
## 77 2017 NA NA
## 78 2016 NA NA
## 79 2013 NA NA
## 80 2015 NA NA
## 81 2018 NA NA
## 82 2017 NA NA
## 83 2014 NA NA
## 84 2017 NA NA
## 85 2014 NA NA
## 86 2017 NA NA
## 87 2013 NA NA
## 88 2013 NA NA
## 89 2017 NA NA
## 90 2014 NA NA
## 91 2009 NA NA
## 92 2017 NA NA
## 93 2013 NA NA
## 94 2014 NA NA
## 95 2015 NA NA
## 96 2017 NA NA
## 97 2017 NA NA
## 98 2017 NA NA
## 99 2018 NA NA
## 100 2012 NA NA
## 101 2018 NA NA
## 102 2018 NA NA
## 103 2018 NA NA
## 104 2017 NA NA
## 105 2017 NA NA
## 106 2017 NA NA
## 107 2018 NA NA
## 108 2017 NA NA
## 109 2015 NA NA
## 110 2017 NA NA
## 111 2017 NA NA
## 112 2017 NA NA
## 113 2017 NA NA
## 114 2014 NA NA
## 115 2017 NA NA
## 116 2017 NA NA
## 117 2017 NA NA
## 118 2017 NA NA
## 119 2017 NA NA
## 120 2017 NA NA
## 121 2017 NA NA
## 122 2002 NA NA
## 123 2002 NA NA
## 124 2017 NA NA
## 125 2008 NA NA
## 126 2009 NA NA
## 127 2017 NA NA
## 128 2002 NA NA
## 129 2009 NA NA
## 130 2013 NA NA
## 131 2012 NA NA
## 132 2017 NA NA
## 133 2017 NA NA
## 134 2018 NA NA
## 135 2017 NA NA
## 136 2016 NA NA
## 137 2019 NA NA
## 138 2018 NA NA
## 139 2017 NA NA
## 140 2016 NA NA
## 141 2017 NA NA
## 142 2017 NA NA
## 143 2017 NA NA
## 144 2017 NA NA
## 145 2018 NA NA
## 146 2017 NA NA
## 147 2015 NA NA
## 148 2017 NA NA
## 149 2019 NA NA
## 150 2017 NA NA
## 151 2016 NA NA
## 152 2018 NA NA
## 153 2017 NA NA
## 154 2017 NA NA
## 155 2017 NA NA
## 156 2017 NA NA
## 157 2017 NA NA
## 158 2017 NA NA
## 159 2017 NA NA
## 160 2018 NA NA
## 161 2018 NA NA
## 162 2017 NA NA
## 163 2017 NA NA
## 164 2017 NA NA
## 165 2017 NA NA
## 166 2017 NA NA
## [ reached 'max' / getOption("max.print") -- omitted 7168775 rows ]
Exploring domestic crimes
We can explore any patterns of domestic crimes. For example, are they common in certain days/hours/months?
domestic=my_collection$find('{"Domestic":true}', fields = '{"_id":0, "Domestic":1,"Date":1}')
domestic$Date= mdy_hms(domestic$Date)
domestic$Weekday = weekdays(domestic$Date)
domestic$Hour = hour(domestic$Date)
domestic$month = month(domestic$Date,label=TRUE)
WeekdayCounts = as.data.frame(table(domestic$Weekday))
WeekdayCounts$Var1 = factor(WeekdayCounts$Var1, ordered=TRUE, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday","Saturday"))
ggplot(WeekdayCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1),size=2,color="red") + xlab("Day of the Week") + ylab("Total Domestic Crimes")+
ggtitle("Domestic Crimes in the City of Chicago Since 2001")+
theme(axis.title.x=element_blank(),axis.text.y = element_text(color="blue",size=11,angle=0,hjust=1,vjust=0),
axis.text.x = element_text(color="blue",size=11,angle=0,hjust=.5,vjust=.5),
axis.title.y = element_text(size=14),
plot.title=element_text(size=16,color="purple",hjust=0.5))
Domestic crimes are common over the weekend than in weekdays? What could be the reason? We can also see the pattern for each day by hour:
DayHourCounts = as.data.frame(table(domestic$Weekday, domestic$Hour))
DayHourCounts$Hour = as.numeric(as.character(DayHourCounts$Var2))
ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1, color=Var1), size=1.4)+ylab("Count")+
ylab("Total Domestic Crimes")+ggtitle("Domestic Crimes in the City of Chicago Since 2001")+
theme(axis.title.x=element_text(size=14),axis.text.y = element_text(color="blue",size=11,angle=0,hjust=1,vjust=0),
axis.text.x = element_text(color="blue",size=11,angle=0,hjust=.5,vjust=.5),
axis.title.y = element_text(size=14),
legend.title=element_blank(),
plot.title=element_text(size=16,color="purple",hjust=0.5))
The crimes peak mainly around mid-night. We can also use one color for weekdays and another color for weekend as shown below.
DayHourCounts$Type = ifelse((DayHourCounts$Var1 == "Sunday") | (DayHourCounts$Var1 == "Saturday"), "Weekend", "Weekday")
ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1, color=Type), size=2, alpha=0.5) +
ylab("Total Domestic Crimes")+ggtitle("Domestic Crimes in the City of Chicago Since 2001")+
theme(axis.title.x=element_text(size=14),axis.text.y = element_text(color="blue",size=11,angle=0,hjust=1,vjust=0),
axis.text.x = element_text(color="blue",size=11,angle=0,hjust=.5,vjust=.5),
axis.title.y = element_text(size=14),
legend.title=element_blank(),
plot.title=element_text(size=16,color="purple",hjust=0.5))
The difference between weekend and weekdays are more clear from this figure than from the previous plot. We can also see the above pattern from a heatmap.
DayHourCounts$Var1 = factor(DayHourCounts$Var1, ordered=TRUE, levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq)) + scale_fill_gradient(name="Total MV Thefts", low="white", high="red") +
ggtitle("Domestic Crimes in the City of Chicago Since 2001")+theme(axis.title.y = element_blank())+ylab("")+
theme(axis.title.x=element_text(size=14),axis.text.y = element_text(size=13),
axis.text.x = element_text(size=13),
axis.title.y = element_text(size=14),
legend.title=element_blank(),
plot.title=element_text(size=16,color="purple",hjust=0.5))
From the heatmap, we can see more crimes over weekends and at night.
monthCounts = as.data.frame(table(domestic$month))
ggplot(monthCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1),size=2,color="red") + xlab("Day of the Week") + ylab("Total Domestic Crimes")+
ggtitle("Domestic Crimes in the City of Chicago Since 2001")+
theme(axis.title.x=element_blank(),axis.text.y = element_text(color="blue",size=11,angle=0,hjust=1,vjust=0),
axis.text.x = element_text(color="blue",size=11,angle=0,hjust=.5,vjust=.5),
axis.title.y = element_text(size=14),
plot.title=element_text(size=16,color="purple",hjust=0.5))
Is domestic crime associated with temperature? Domestic crimes tremendously increases during the warm months. Now, let’s see the pattern of other crime types. Since there are 35 primary types, we cannot see all of them in this post. Let’s focus on four of the most common ones.
crimes=my_collection$find('{}', fields = '{"_id":0, "PrimaryType":1,"Year":1}')
crimes%>%group_by(PrimaryType)%>%summarize(Count=n())%>%arrange(desc(Count))%>%head(4)
## `summarise()` ungrouping output (override with `.groups` argument)
As shown in the table above, the most common crime type is theft followed by battery. Narcotics is fourth most common while criminal damage is the third most common crime type in the city of Chicago. Now, let’s generate plots by day and hour.
four_most_common=crimes%>%group_by(PrimaryType)%>%summarize(Count=n())%>%arrange(desc(Count))%>%head(4)
## `summarise()` ungrouping output (override with `.groups` argument)
four_most_common=four_most_common$PrimaryType
crimes=my_collection$find('{}', fields = '{"_id":0, "PrimaryType":1,"Date":1}')
crimes=filter(crimes,PrimaryType %in%four_most_common)
crimes$Date= mdy_hms(crimes$Date)
crimes$Weekday = weekdays(crimes$Date)
crimes$Hour = hour(crimes$Date)
crimes$month=month(crimes$Date,label = TRUE)
g = function(data){
WeekdayCounts = as.data.frame(table(data$Weekday))
WeekdayCounts$Var1 = factor(WeekdayCounts$Var1, ordered=TRUE, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday","Saturday"))
ggplot(WeekdayCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1),size=2,color="red") + xlab("Day of the Week") +
theme(axis.title.x=element_blank(),axis.text.y = element_text(color="blue",size=10,angle=0,hjust=1,vjust=0),
axis.text.x = element_text(color="blue",size=10,angle=0,hjust=.5,vjust=.5),
axis.title.y = element_text(size=11),
plot.title=element_text(size=12,color="purple",hjust=0.5))
}
g1=g(filter(crimes,PrimaryType=="THEFT"))+ggtitle("Theft")+ylab("Total Count")
g2=g(filter(crimes,PrimaryType=="BATTERY"))+ggtitle("BATTERY")+ylab("Total Count")
g3=g(filter(crimes,PrimaryType=="CRIMINAL DAMAGE"))+ggtitle("CRIMINAL DAMAGE")+ylab("Total Count")
g4=g(filter(crimes,PrimaryType=="NARCOTICS"))+ggtitle("NARCOTICS")+ylab("Total Count")
grid.arrange(g1,g2,g3,g4,ncol=2)
g=function(data){
monthCounts = as.data.frame(table(data$month))
ggplot(monthCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1),size=2,color="red") + xlab("Day of the Week") +
theme(axis.title.x=element_blank(),axis.text.y = element_text(color="blue",size=11,angle=0,hjust=1,vjust=0),
axis.text.x = element_text(color="blue",size=11,angle=0,hjust=.5,vjust=.5),
axis.title.y = element_text(size=14),
plot.title=element_text(size=16,color="purple",hjust=0.5))
}
g1=g(filter(crimes,PrimaryType=="THEFT"))+ggtitle("Theft")+ylab("Total Count")
g2=g(filter(crimes,PrimaryType=="BATTERY"))+ggtitle("BATTERY")+ylab("Total Count")
g3=g(filter(crimes,PrimaryType=="CRIMINAL DAMAGE"))+ggtitle("CRIMINAL DAMAGE")+ylab("Total Count")
g4=g(filter(crimes,PrimaryType=="NARCOTICS"))+ggtitle("NARCOTICS")+ylab("Total Count")
grid.arrange(g1,g2,g3,g4,ncol=2)
g=function(data){
monthCounts = as.data.frame(table(data$month))
ggplot(monthCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1),size=2,color="red") + xlab("Day of the Week") +
theme(axis.title.x=element_blank(),axis.text.y = element_text(color="blue",size=11,angle=0,hjust=1,vjust=0),
axis.text.x = element_text(color="blue",size=11,angle=0,hjust=.5,vjust=.5),
axis.title.y = element_text(size=14),
plot.title=element_text(size=16,color="purple",hjust=0.5))
}
g1=g(filter(crimes,PrimaryType=="THEFT"))+ggtitle("Theft")+ylab("Total Count")
g2=g(filter(crimes,PrimaryType=="BATTERY"))+ggtitle("BATTERY")+ylab("Total Count")
g3=g(filter(crimes,PrimaryType=="CRIMINAL DAMAGE"))+ggtitle("CRIMINAL DAMAGE")+ylab("Total Count")
g4=g(filter(crimes,PrimaryType=="NARCOTICS"))+ggtitle("NARCOTICS")+ylab("Total Count")
grid.arrange(g1,g2,g3,g4,ncol=2)
Except, narcotics, all increase in the warmer months. Does this have any association with temperature? We can also produce maps
# Register the google api key
ggmap::register_google(key = "MySecretAPIKey")
# Load a map of Chicago into R:
chicago <- get_map(location = "chicago", zoom = 11)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=chicago&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=chicago&key=xxx
# Round our latitude and longitude to 2 digits of accuracy, and create a crime counts data frame for each area:
query3= my_collection$find('{}', fields = '{"_id":0, "Latitude":1, "Longitude":1,"Year":1}')
LatLonCounts=as.data.frame(table(round(query3$Longitude,2),round(query3$Latitude,2)))
# Convert our Longitude and Latitude variable to numbers:
LatLonCounts$Long = as.numeric(as.character(LatLonCounts$Var1))
LatLonCounts$Lat = as.numeric(as.character(LatLonCounts$Var2))
ggmap(chicago) + geom_tile(data = LatLonCounts, aes(x = Long, y = Lat, alpha = Freq), fill="red")+
ggtitle("Crime Distribution")+labs(alpha="Count")+theme(plot.title = element_text(hjust=0.5))
We can also see a map for domestic crimes only:
domestic=my_collection$find('{"Domestic":true}', fields = '{"_id":0, "Latitude":1, "Longitude":1,"Year":1}')
LatLonCounts=as.data.frame(table(round(domestic$Longitude,2),round(domestic$Latitude,2)))
LatLonCounts$Long = as.numeric(as.character(LatLonCounts$Var1))
LatLonCounts$Lat = as.numeric(as.character(LatLonCounts$Var2))
ggmap(chicago) + geom_tile(data = LatLonCounts, aes(x = Long, y = Lat, alpha = Freq), fill="red")+
ggtitle("Domestic Crimes")+labs(alpha="Count")+theme(plot.title = element_text(hjust=0.5))
Let’s see where motor vehicle theft is common:
mtheft=my_collection$find('{"PrimaryType":"MOTOR VEHICLE THEFT"}', fields = '{"_id":0, "Latitude":1, "Longitude":1,"Year":1}')
LatLonCounts=as.data.frame(table(round(mtheft$Longitude,2),round(mtheft$Latitude,2)))
LatLonCounts$Long = as.numeric(as.character(LatLonCounts$Var1))
LatLonCounts$Lat = as.numeric(as.character(LatLonCounts$Var2))
ggmap(chicago) + geom_tile(data = LatLonCounts, aes(x = Long, y = Lat, alpha = Freq), fill="red")+
ggtitle("Motor Vehicle Theft ")+labs(alpha="Count")+theme(plot.title = element_text(hjust=0.5))
Domestic crimes show concentration over two areas whereas motor vehicle theft is wide spread over large part of the city of Chicago.
Finding the count for each primary type of crime, and plotting heat maps or concentration maps for most common type of crime:
least_crimes=my_collection$find('{}', fields = '{"_id":0, "PrimaryType":1,"Year":1}')
least_crimes%>%group_by(PrimaryType)%>%summarize(Count=n())%>%arrange(desc(Count))%>%head(-1)
## `summarise()` ungrouping output (override with `.groups` argument)
my_collection$aggregate('[{"$group":{"_id":"$PrimaryType", "Count": {"$sum":1}}}]')%>%na.omit()%>%
arrange(desc(Count))%>%head(10)%>%
ggplot(aes(x=reorder(`_id`,Count),y=Count))+
geom_bar(stat="identity",color='lightgreen',fill='#b35900')+geom_text(aes(label = Count), color = "green") +coord_flip()+xlab("Types of Crimes")
# We can see that the most common type of crime is "THEFT", so lets make a heatmap for theft
mtheft=my_collection$find('{"PrimaryType":"THEFT"}', fields = '{"_id":0, "Latitude":1, "Longitude":1,"Year":1}')
LatLonCounts=as.data.frame(table(round(mtheft$Longitude,2),round(mtheft$Latitude,2)))
LatLonCounts$Long = as.numeric(as.character(LatLonCounts$Var1))
LatLonCounts$Lat = as.numeric(as.character(LatLonCounts$Var2))
ggmap(chicago) + geom_tile(data = LatLonCounts, aes(x = Long, y = Lat, alpha = Freq), fill="red")+
ggtitle("Theft")+labs(alpha="Count")+theme(plot.title = element_text(hjust=0.5))